## ----------------------------------------------------------------------
##
## Analyses for Appendix B thru K 
##
## ----------------------------------------------------------------------

## environment setting
Sys.setenv(LANGUAGE="en")
gc();gc()
rm(list = ls())
options(scipen = 999)   ## Disable scientific notation

## ----------------------------------------------------------------------
##
## Initial settings
##
## ----------------------------------------------------------------------
## Set directory
## ----------------------------------------------------------------------
root_dir = "YOUR_DIRECTORY/Replication_Folder"

## Load packages and functions
source(file.path(root_dir, "2_Code/1_PackagesFunctions.R"))

## ----------------------------------------------------------------------
## Data directory
data_dir = root_dir	%>%	file.path("1_Data")	%>%	dir_create()
## Output directory
output_dir = root_dir	%>%	file.path("3_Result")	%>%	dir_create()
figure_dir = root_dir	%>%	file.path("4_Figures")	%>%	dir_create()
## Working directory
working_dir = root_dir	%>%	file.path("5_Working")	%>%	dir_create()
## ----------------------------------------------------------------------

## additional packages for this file
needs("ppcor") ## for correlation matrix
needs("readxl") ## to read xlsx
needs("lsr") ## for correlate function
needs("GGally") ## for ggpairs



## ----------------------------------------------------------------------
##
## Table B1
## Regress aiming point dummies on the covariates
##
## ----------------------------------------------------------------------
## Load data
## ----------------------------------------------------------------------
## --- Census data
tokyo_census_tbl = data_dir	%>%	
  # file.path("TokyoCensusRaid_May2020.rds")	%>%	
  # read_rds()	%>%
  file.path("census_data4.csv") %>%
  read_csv(col_types = cols()) %>%
  select(row_id, everything())	%>%
  filter(year == 2010)	%>%	drop_na(ratio_damage)	%>%
  mutate(key_code = as.character(key_code)) %>%
  ## --- Combine the aiming point table
  left_join(
    ## --- Aiming points
    data_dir %>%	file.path("PairwiseDistance_Poly2AP.csv")	%>%
      read_csv(col_types = cols())	%>%	select(-row_id)	%>%
      mutate(key_code = as.character(key_code)),
    by = "key_code")	%>%
  rownames_to_column(var = "row_count")	%>%
  mutate(DistrictFE = as.factor(prewar_district))

## ----------------------------------------------------------------------
##
## Variables and models
##
## ----------------------------------------------------------------------
## Outcome
outcome = "AimingPointDummy"
## ----------------------------------------------------------------------
## Covariates
## ----------------------------------------------------------------------
## District FE
fe_term = "DistrictFE"
## Geographical features
spatial_covariates = c(
  ## Residential ratio and geographical area
  "ratio_residential", "poly_area_ln", "n_neighbor",
  ## Prewar population density
  "PopDensity_1939_km2_ln",
  ## Terrain variables
  "mean_elevation_ln", "mean_slope_ln", "river_distance_ln",
  ## Railway and train stations
  "railway_length_ln", "n_stations", "SpLag_railway_length_ln", "SpLag_n_stations"
)
## ----------------------------------------------------------------------
## AP-related variables
far_ap_dummy = c("MoreThanMaxMinAP_PairwiseDist")
ap_smoothing_termz = list(
  str_c("poly(", c("log(AimingPointDistance_km_1)", "log(AimingPointDistance_km_2)"), ", degree = 2, simple = TRUE, raw = TRUE)"),
  str_c("poly(", c("log(AimingPointDistance_km_1)", "log(AimingPointDistance_km_2)"), ", degree = 3, simple = TRUE, raw = TRUE)")
)
## ----------------------------------------------------------------------
## Location variables
distance_termz_base = c(
  # "minTargetDistance_ln",
  "palace_dist_ln"
)
polynomial_degree = 5
## --- Polynomials and splines (single dimension)
distance_term_specification = list(
  ## --- Polynomials
  str_c("poly(", distance_termz_base, ", degree = ", polynomial_degree, ", simple = TRUE, raw = TRUE)"),
  ## --- Splines
  str_c("s(", distance_termz_base, ")"),
  ## --- none (comparison purposes)
  "")
## --- Lon-lat (two-dimensional) term
spatial_term_specification = c(
  ## --- Polynomials
  str_c("poly(z_lon, z_lat, degree = ",  polynomial_degree, ", simple = TRUE, raw = TRUE)"),
  ## --- Splines
  "te(longitude, latitude)")
## ----------------------------------------------------------------------

## ----------------------------------------------------------------------
## Formula objects
## ----------------------------------------------------------------------
mdl_lst = list(
  ## Polynomial: felm()-compatible specification
  polynomial_base_felm = chr2fml_felm(
    outcome,
    idv_list = list(
      spatial_covariates,
      distance_term_specification[[1]],
      spatial_term_specification[1]),
    fe_list = fe_term,
    se_cluster = "row_count"	## Robust SE
    # se_cluster = fe_term	## Cluster SE
  ),
  ## Polynomial: felm()-compatible specification
  polynomialsquared_felm = chr2fml_felm(
    outcome,
    idv_list = list(
      far_ap_dummy, ap_smoothing_termz[[1]],
      spatial_covariates,
      distance_term_specification[[1]],
      spatial_term_specification[1]),
    fe_list = fe_term,
    se_cluster = "row_count"	## Robust SE
    # se_cluster = fe_term	## Cluster SE
  ),
  ## Polynomial: felm()-compatible specification
  polynomialcubic_felm = chr2fml_felm(
    outcome,
    idv_list = list(
      far_ap_dummy, ap_smoothing_termz[[2]],
      spatial_covariates,
      distance_term_specification[[1]],
      spatial_term_specification[1]),
    fe_list = fe_term,
    se_cluster = "row_count"	## Robust SE
    # se_cluster = fe_term	## Cluster SE
  )
)

## ----------------------------------------------------------------------
## Result list object
## ----------------------------------------------------------------------
rslt_lst = list(
  ## ----------------------------------------------------------------------
  ## 1: Baseline model w/o any AP info
  simple_mod = mdl_lst[[1]]	%>%	felm(data = tokyo_census_tbl, keepCX = TRUE, psdef = FALSE, weights = tokyo_census_tbl$pop),
  ## ----------------------------------------------------------------------
  ## 2: w/ far AP dummy and squared terms
  squared_mod = mdl_lst[[2]]	%>%	felm(data = tokyo_census_tbl, keepCX = TRUE, psdef = FALSE, weights = tokyo_census_tbl$pop),
  ## ----------------------------------------------------------------------
  ## 3: w/ far AP dummy and cubic terms
  cubic_mod = mdl_lst[[3]]	%>%	felm(data = tokyo_census_tbl, keepCX = TRUE, psdef = FALSE, weights = tokyo_census_tbl$pop)
  ## ----------------------------------------------------------------------
)
## ----------------------------------------------------------------------
## Print regression estimates
## ----------------------------------------------------------------------
stargazer(
  rslt_lst,
  type = "text",
  single.row = TRUE,
  align = TRUE,
  covariate.labels = c(
    "MoreThanMaxMinAP_PairwiseDist" = "Beyond max(Shortest distance between neighboring aiming points)",
    "poly(log(AimingPointDistance_km_1), degree = 2, simple = TRUE, raw = TRUE)1"="ln(Aiming point distance)",
    "poly(log(AimingPointDistance_km_1), degree = 2, simple = TRUE, raw = TRUE)2"="ln(Aiming point distance)sq",
    "poly(log(AimingPointDistance_km_2), degree = 2, simple = TRUE, raw = TRUE)1"="ln(Second nearest AP distance)",
    "poly(log(AimingPointDistance_km_2), degree = 2, simple = TRUE, raw = TRUE)2"="ln(Second nearest AP distance)sq",
    "poly(log(AimingPointDistance_km_1), degree = 3, simple = TRUE, raw = TRUE)1"="ln(Aiming point distance)",
    "poly(log(AimingPointDistance_km_1), degree = 3, simple = TRUE, raw = TRUE)2"="ln(Aiming point distance)sq",
    "poly(log(AimingPointDistance_km_1), degree = 3, simple = TRUE, raw = TRUE)3"="ln(Aiming point distance)cu",
    "poly(log(AimingPointDistance_km_2), degree = 3, simple = TRUE, raw = TRUE)1"="ln(Second nearest AP distance)",
    "poly(log(AimingPointDistance_km_2), degree = 3, simple = TRUE, raw = TRUE)2"="ln(Second nearest AP distance)sq",
    "poly(log(AimingPointDistance_km_2), degree = 3, simple = TRUE, raw = TRUE)3"="ln(Second nearest AP distance)cu",
    "ratio_residential" ="Residential ratio",
    "poly_area_ln" ="ln(Area)",
    "n_neighbor" ="Number of neighbor polygons",
    "PopDensity_1939_km2_ln" ="ln(Population density (1939))",
    "mean_elevation_ln" ="ln(Mean elevation)",
    "mean_slope_ln" ="ln(Mean slope)",
    "river_distance_ln" ="ln(River distance)",
    "railway_length_ln" ="ln(Railway length)",
    "n_stations" ="Number of stations",
    "SpLag_railway_length_ln" ="ln(Railway length), spatial lag",
    "SpLag_n_stations" ="Number of stations, spatial lag",
    "poly(palace_dist_ln, degree = 5, simple = TRUE, raw = TRUE)1" ="ln(Imperial Palace distance)",
    "poly(palace_dist_ln, degree = 5, simple = TRUE, raw = TRUE)2" ="ln(Imperial Palace distance)sq",
    "poly(palace_dist_ln, degree = 5, simple = TRUE, raw = TRUE)3" ="ln(Imperial Palace distance)cu",
    "poly(palace_dist_ln, degree = 5, simple = TRUE, raw = TRUE)4" ="ln(Imperial Palace distance)4th",
    "poly(palace_dist_ln, degree = 5, simple = TRUE, raw = TRUE)5" ="ln(Imperial Palace distance)5th"
    )
)

## ----------------------------------------------------------------------
##
## Figure C123: balance checks
##
## ----------------------------------------------------------------------

## ----------------------------------------------------------------------
## Prepare data objects
## ----------------------------------------------------------------------
## Read data
tokyo2keep = data_dir	%>%
  file.path("census_data4.csv") %>%
  read_csv(col_types = cols()) %>%
  drop_na(ratio_damage)

## ----------------------------------------------------------------------
## Prepare variables
## ----------------------------------------------------------------------
## Dependent variables
## ----------------------------------------------------------------------
## Logged DVs
depvar_vec = c(
  "lnr_unemp", "lnrm_unemp", "lnrf_unemp",
  "lnr_proexe", "lnave_lvlength", "lnave_eduyr"
)
## ----------------------------------------------------------------------
## Logged negative controls
negative_cntrlz = c("lnrmale0004")
## ----------------------------------------------------------------------
## Dependent variable vector (all DVs and negative controls combined): 22 DVs
depvar_vec_combined = c(depvar_vec, negative_cntrlz)
## Dependent variables: for alternative treatment indicators
depvar_vec_alternative_treatment = c(depvar_vec, negative_cntrlz)

## ----------------------------------------------------------------------
## Treatment variables
## ----------------------------------------------------------------------
treatment_vec = c("ratio_damage", "ln_damage_ratio", "binary_damage_ratio")
robust_treatment_vec = c(treatment_vec[2:3], str_c("as.factor(", treatment_vec[1],")"))
## ----------------------------------------------------------------------
## Pretreatment covariates
## ----------------------------------------------------------------------
## Distance variables
distance_termz_base = c("palace_dist_ln", "minTargetDistance_ln")
## Polynomial
polynomial_degree = 5
# distance_termz_poly = polynomial_varname(distance_termz_base, degree = polynomial_degree)
distance_termz_poly = str_c("poly(", distance_termz_base, ", degree = ", polynomial_degree, ", simple = TRUE, raw = TRUE)")
distance_term_specification = list( distance_termz_poly, str_c("s(", distance_termz_base, ")") )
## Geographical features
spatial_covariates = c(
  ## Residential ratio and geographical area
  "ratio_residential", "poly_area_ln", "n_neighbor",
  ## Prewar population density
  "PopDensity_1939_km2_ln",
  ## Terrain variables
  "mean_elevation_ln", "mean_slope_ln", "river_distance_ln",
  ## Railway
  "railway_length_ln", "n_stations", "SpLag_railway_length_ln", "SpLag_n_stations"
)
## Flammability score
flame_scorez = c("hazard", "max_nghb_hazard")
## ----------------------------------------------------------------------
## Spatial spline/polynomials and fixed effects
## ----------------------------------------------------------------------
## District FEs
district_fe = "as.factor(prewar_district)"	## 35-district FE
## Spatial terms
spatial_term_lbl = c("Polynomial", "GAM")	## This indicates polynomial or spline specification in the csv files
lonlat_base = c("z_lon", "z_lat")
spatial_polynomial = str_c("poly(z_lon, z_lat, degree = ",  polynomial_degree, ", simple = TRUE, raw = TRUE)")
spatial_term_specification = list(spatial_polynomial, "te(longitude, latitude)")
## ----------------------------------------------------------------------
## Pretreatment covariates to plot for balance checking
balance_varz = c(spatial_covariates, distance_termz_base, spatial_polynomial[1:2])



## ----------------------------------------------------------------------
## Figure parameters
## ----------------------------------------------------------------------
set.seed(123)
fig_height = 3.85
base_cex = .75
ax_lbl_cex = 0.8
fig_mrgn = c(3, 3, 1, 0) + 0.3   ## figure margin: bottom, left, top, right
boxplot_colz = c(
  adjustcolor("lightsteelblue2", alpha = 0.5),
  adjustcolor("lightsteelblue4", alpha = 0.85),
  adjustcolor("maroon2", alpha = 0.65)
)
## ----------------------------------------------------------------------
## Prepare data
## ----------------------------------------------------------------------
boxplot_data = tokyo2keep %>%
  ## 2010 observation because the spatial variables are based on the 2010 census polygons
  filter(year == 2010)    %>%
  select(-year)
boxplot_obs = boxplot_data  %>%
  group_by(ratio_damage)  %>%
  summarize(n_obs = n())  %>%
  select(n_obs)   %>%
  as.matrix   %>%
  as.vector
## ----------------------------------------------------------------------
## Figure C2 & C3: Balance Check --- 
## pretreatment cov & sp-lagged pretreatment cov
## * See below for Figure C1
## ----------------------------------------------------------------------
balance_varz_lbl = c(
  "Residential Ratio", "ln Area", "N Neighboring Census Units", "ln Elevation", "ln Slope",
  "ln River Distance (km)", "ln Railway Length (km)", "ln N Stations", "ln Railway Length (km) Spatial Lag", "ln N Stations Spatial Lag",
  "ln Palace Distance (km)", "ln min(Target Distance) (km)",
  "Longitude", "Latitude"
  # "ln Palace Distance (km, standardized)", "ln min(Target Distance) (km, standardized)", "Longitude (standardized)", "Latitude (standardized)"
)

## ----------------------------------------------------------------------
## Residualized plot
## ----------------------------------------------------------------------
png_resolution = 360
fig_height = 3.85
fig_mrgn = c(2.45, 2.35, 0.3, 0.3) + 0.3   ## figure margin: bottom, left, top, right
xlimz = c(-3,3)
ylimz = c(-6,6)
base_cex = 1.2
ax_lbl_cex = 1.25
## Pretreatment covariates
pretreatment_combined = c(spatial_covariates, distance_term_specification[[1]], spatial_term_specification[[1]], district_fe)

## Figure name label
bc_label = c("fig_c2_a", "fig_c2_e", "fig_c3_a", "fig_c2_b", "fig_c2_f", "fig_c2_g", "fig_c2_h", "fig_c2_i", "fig_c2_j", "fig_c3_b", "fig_c3_c", "fig_c2_c", "fig_c2_d")

## Loop over covariates
for (i in 1:(length(balance_varz)-2)) {

  ## Print progress
  cat(str_c("Pretreatment covariate: ", i, "/", (length(balance_varz)-2), " --- done. \r") )
  ## ----------------------------------------------------------------------
  ## Pull out covariate
  tmp_covariate = balance_varz[i] ## ith covariate
  tmp_pretreatment = pretreatment_combined[!str_detect(pretreatment_combined, tmp_covariate)]   ## Drop ith pretreatment covariate
  ## Prepare data
  residual_raid_fml = chr2fml("ratio_damage", idv_list = list(tmp_pretreatment))
  residual_x_fml = chr2fml(tmp_covariate, idv_list = list(tmp_pretreatment))
  ## Pull out residuals
  residual_raid = lm(residual_raid_fml,
                     data = boxplot_data %>%
                       mutate_at(
                         vars("ratio_damage", tmp_covariate),
                         funs((.-mean(.))/sd(.))
                       ),
                     weights = pop
  )   %>% residuals
  residual_x = lm(residual_x_fml,
                  data = boxplot_data %>%
                    mutate_at(
                      vars("ratio_damage", tmp_covariate),
                      funs((.-mean(.))/sd(.))
                    ),
                  weights = pop
  )   %>% residuals
  ## ----------------------------------------------------------------------
  ## Figure path
  # tmp_path = file.path(figure_dir, str_c(i, "_", tmp_covariate, "_residualized.png"))
  tmp_path = file.path(figure_dir, str_c(bc_label[i], ".png"))
  png(tmp_path, width = fig_height, height = sqrt(2)*fig_height, units = "in", res = 240)
  # dev.new(width = fig_height, height = sqrt(2)*fig_height)
  par(cex = base_cex, mar = fig_mrgn)
  ## ----------------------------------------------------------------------
  ## Scatter plot
  pt_cex_weight = boxplot_data$pop/1000
  plot(residual_raid, y = residual_x,
       pch = 21, bg = boxplot_colz[1], col = boxplot_colz[2], lwd = 0.25,
       cex = .25*pt_cex_weight,
       xlim = xlimz, ylim = ylimz,
       xaxs = "i", yaxs = "i", axes = FALSE, xlab = NA, ylab = NA
  )
  ## ----------------------------------------------------------------------
  ## Regress and add notations
  ## Zero-reference
  abline(h=0, col = adjustcolor("black", alpha = 0.5), lwd = 0.65)
  ## Regression slope
  tmp_res_line = lm(residual_x ~ residual_raid, weights = boxplot_data$pop)
  abline(tmp_res_line, col = "maroon2")
  tmp_info = model_info_lm(tmp_res_line, x2extract = "residual_raid")
  tmp_res_lbl = tmp_info[1] %>% round(3)
  tmp_res_lbl = str_c("Slope = ", tmp_res_lbl)
  text(x = max(residual_raid)*0.1, y = 5.25, labels = tmp_res_lbl, cex = 1, col = "maroon2", pos = 3)

  ## Axis etc.
  for (j in 1:2) {
    # tick = seq(-2, 2, by = 0.2)
    axis(j, line = 0, label = NA)
    axis(j, line = 0, label = NA, at = seq(-6,6, by = 0.5), tck = -0.02)
    axis(j, line = 0, label = NA, at = seq(-6,6, by = 0.1), tck = -0.01)
    axis(j, line = -0.5, tick = FALSE, at = seq(-6,6, by = 1))
  }
  mtext("Raid Damage (residualized)", side = 1, line = 1.65, cex = ax_lbl_cex)
  mtext(str_c(balance_varz_lbl[i], " (residualized)"), side = 2, line = 1.65, cex = ax_lbl_cex)
  box()
  ## Close
  dev.off()
}

## ----------------------------------------------------------------------
## Figure C1: Balance Check --- Fire hazard
## ----------------------------------------------------------------------
hazard_varz = c("hazard", "min_nghb_hazard", "max_nghb_hazard")
## Pretreatment covariates --- EXCLUDING the neighbor hazard ratings
pretreatment_combined = c(spatial_covariates, distance_term_specification[[1]], spatial_term_specification[[1]])
## Prepare data
hazard_data = tokyo2keep   %>%
  ## 2010 observation because the spatial variables are based on the 2010 census polygons
  filter(
    year == 2010, !is.na(hazard),
    !is.na(min_nghb_hazard), !is.na(max_nghb_hazard)
  )   %>%
  select(-year)
## Variable names in figures
hazard_varz_lbl = c("Fire Hazard", "min(Neighbor Fire Hazard)", "max(Neighbor Fire Hazard)")

## Figure name label
bc_label2 = c("fig_c1_a", "fig_c1_b", "fig_c1_c")

## Loop over covariates
for (i in 1:length(hazard_varz)) {
  ## Print progress
  cat(str_c("Pretreatment covariate: ", i, "/", length(hazard_varz), " --- done. \r") )
  ## ----------------------------------------------------------------------
  ## Pull out covariate
  tmp_covariate = hazard_varz[i] ## ith covariate
  tmp_pretreatment = pretreatment_combined[!str_detect(pretreatment_combined, tmp_covariate)]   ## Drop ith pretreatment covariate
  ## Prepare data
  residual_raid_fml = chr2fml("ratio_damage", idv_list = list(tmp_pretreatment))
  residual_x_fml = chr2fml(tmp_covariate, idv_list = list(tmp_pretreatment))
  ## Pull out residuals
  residual_raid = lm(residual_raid_fml,
                     data = hazard_data %>%
                       mutate_at(
                         vars("ratio_damage", tmp_covariate),
                         funs((.-mean(.))/sd(.))
                       ),
                     weights = pop
  )   %>% residuals
  residual_x = lm(residual_x_fml,
                  data = hazard_data %>%
                    mutate_at(
                      vars("ratio_damage", tmp_covariate),
                      funs((.-mean(.))/sd(.))
                    ),
                  weights = pop
  )   %>% residuals
  ## ----------------------------------------------------------------------
  ## Figure path
  # tmp_path = file.path(figure_dir, str_c(i + length(balance_varz)-2, "_", tmp_covariate, "_residualized.png"))
  tmp_path = file.path(figure_dir, str_c(bc_label2[i], ".png"))
  png(tmp_path, width = fig_height, height = sqrt(2)*fig_height, units = "in", res = 240)
  # dev.new(width = fig_height, height = sqrt(2)*fig_height)
  par(cex = base_cex, mar = fig_mrgn)
  ## ----------------------------------------------------------------------
  ## Scatter plot
  pt_cex_weight = hazard_data$pop/1000
  plot(residual_raid, y = residual_x,
       pch = 21, bg = boxplot_colz[1], col = boxplot_colz[2], lwd = 0.25,
       cex = .25*pt_cex_weight,
       xlim = xlimz, ylim = ylimz,
       xaxs = "i", yaxs = "i", axes = FALSE, xlab = NA, ylab = NA
  )
  ## ----------------------------------------------------------------------
  ## Regress and add notations
  ## Zero-reference
  abline(h=0, col = adjustcolor("black", alpha = 0.5), lwd = 0.65)
  ## Regression slope
  tmp_res_line = lm(residual_x ~ residual_raid, weights = hazard_data$pop)
  abline(tmp_res_line, col = "maroon2")
  tmp_info = model_info_lm(tmp_res_line, x2extract = "residual_raid")
  tmp_res_lbl = tmp_info[1] %>% round(3)
  tmp_res_lbl = str_c("Slope = ", tmp_res_lbl)
  ## Add text
  text(x = max(residual_raid)*0.1, y = 5.25, labels = tmp_res_lbl, cex = 1, col = "maroon2", pos = 3)
  ## Raw regression
  # tmp_res_line = lm(chr2fml(tmp_covariate, idv_list = list("ratio_damage")), data = hazard_data)
  # abline(tmp_res_line, col = boxplot_colz[3], lty = "dashed")
  ## Axis etc.
  for (j in 1:2) {
    # tick = seq(-2, 2, by = 0.2)
    axis(j, line = 0, label = NA)
    axis(j, line = 0, label = NA, at = seq(-6,6, by = 0.5), tck = -0.02)
    axis(j, line = 0, label = NA, at = seq(-6,6, by = 0.1), tck = -0.01)
    axis(j, line = -0.5, tick = FALSE, at = seq(-6,6, by = 1))
  }
  mtext("Raid Damage (residualized)", side = 1, line = 1.65, cex = ax_lbl_cex)
  mtext(str_c(hazard_varz_lbl[i], " (residualized)"), side = 2, line = 1.65, cex = ax_lbl_cex)
  box()
  ## Close
  dev.off()
}


## ----------------------------------------------------------------------
##
## Figure E1: Histogram of the number of pictures taken on each day.
## Actual figure was produced by Stata and this code produces equivalent one.
## 
## ----------------------------------------------------------------------

## LOAD MASTER DATA
data = data_dir %>%
  file.path("Aerial_Photo_Freq.dta")	%>%
  read_dta()

# Create the histogram using ggplot2
plot <- ggplot(data, aes(x = date2)) +
  geom_histogram(binwidth = 2, fill = "steelblue", color = NA) +
  labs(title = "Number of Photos in the Dataset",
       x = "Date",
       y = "Frequency") +
  theme_minimal() +
  theme(plot.background = element_rect(fill = "white"),
        panel.background = element_rect(fill = "white"),
        panel.grid = element_blank(),
        axis.line = element_line(color = "black"))

# Save the plot as a PDF
ggsave(figure_dir %>% file.path("figure_e1.pdf"), plot, width = 8, height = 5)



## ----------------------------------------------------------------------
##
## Figure H3
## correlation matrix for the intercoder validity
##
## ----------------------------------------------------------------------

## ----------------------------------------------------------------------
## Load data
## ----------------------------------------------------------------------
## human coding shape file
raid_shape = data_dir	%>%
  list.files(pattern = "raid_shape.shp$", full.names = TRUE)	%>%
  read_sf() %>%
  rename(key_code = key_cod) %>%
  ## create damage from map variable
  mutate(dmg_map = NA)

## WARINING: THE FOLLOWING LOOP TAKES ABOUT ONE HOUR TO FINISH.
## loop over 3 layers RGB
for (b in 1:3){
  ## crop using raster package
  for (i in 1:dim(raid_shape)[1]){
    raid_shape[i,9] = 
      data_dir %>%
      list.files(pattern = "georeffed_red.tif$", full.names = TRUE) %>%
      ## load red part of damage map
      raster(band = b) %>% 
      crop(extent(raid_shape[i,])) %>%
      mask(raid_shape[i,]) %>%
      ## need to use raster package's as.matrix()
      raster::as.matrix() %>%
      as.numeric() %>%
      mean(na.rm = T)
    cat("i =", i, "\r")
  }
  write_rds(raid_shape,file = file.path(working_dir, str_c("raid_shape_dmg_map_band", b,".rds")))
}

## human coding data file
humancoder = data_dir	%>%
  list.files(pattern = "eval1.xlsx$", full.names = TRUE)	%>%
  read_excel(sheet="eval1") %>%
  mutate(dmg_coder1 = damage/10) %>%
  select(row_id, dmg_coder1)

## master data file
raid_data = data_dir	%>%
  file.path("census_data4.csv") %>%
  read_csv(col_types = cols()) %>%
  mutate(key_code = as.character(key_code)) 

## trim master data
raid_data_s = raid_data %>%
  filter(year==2010) %>%
  select(key_code, row_id, ratio_damage, ratio_residential) %>%
  left_join(humancoder, by = "row_id")

## map based damage variable
raid_merged = 
  ## band=1
  working_dir	%>%
  file.path("raid_shape_dmg_map_band1.rds")	%>%
  read_rds() %>%
  select(key_code,dmg_map) %>%
  rename(dmg_map_b1 = dmg_map) %>%
  ## merge band=2
  left_join(
    working_dir	%>%
      file.path("raid_shape_dmg_map_band2.rds")	%>%
      read_rds() %>%
      as_tibble() %>%
      select(key_code,dmg_map) %>%
      mutate(dmg_map = -1*dmg_map) %>%
      rename(dmg_map_b2 = dmg_map),
    by = "key_code"  
  ) %>%
  ## merge band=3
  left_join(
    working_dir	%>%
      file.path("raid_shape_dmg_map_band3.rds")	%>%
      read_rds() %>%
      as_tibble() %>%
      select(key_code,dmg_map) %>%
      mutate(dmg_map = -1*dmg_map) %>%
      rename(dmg_map_b3 = dmg_map),
    by = "key_code"  
  ) %>% 
  ## adding up all color elements
  mutate(dmg_map_b123 = dmg_map_b1+dmg_map_b2+dmg_map_b3) %>%
  ## drop duplicated entries
  group_by(key_code,dmg_map_b123) %>%
  slice_head() %>%
  ungroup() %>%
  ## merge human code data
  left_join(raid_data_s, by = "key_code") %>%
  ## map based
  mutate(dmg_map_b123_0to1 = (dmg_map_b123-min(dmg_map_b123))/(max(dmg_map_b123)-min(dmg_map_b123))) %>%
  ## original
  mutate(ratio_dmg = ratio_damage * ratio_residential) %>%
  ## student
  mutate(ratio_dmg_coder1 = dmg_coder1 * ratio_residential)


# delete unnecessary variables to make ggpair
raid_merged_s = raid_merged %>%
  as_tibble() %>%
  select(ratio_dmg, dmg_map_b123_0to1, ratio_dmg_coder1) 



## ----------------------------------------------------------------------
## CALCULATE CORRELATION AND GRAPH
## ----------------------------------------------------------------------

## produce correlogram in PDF with ggpairs
file_cor_mat = figure_dir %>% 
  file.path(str_c("cor_mat.pdf"))
pdf(file_cor_mat, height = 3, width = 3)
g <- ggpairs(raid_merged_s,
             lower = list(
               continuous = wrap("points", 
                                 size = .5,
                                 color = "blue", 
                                 alpha = 0.05)),
             columnLabels = c("Author", "Map", "Student")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
print(g)
dev.off()

## Code for partial correlations in Appendix H
raid_merged_s2 = raid_merged_s %>%
  filter(!is.na(ratio_dmg)) %>%
  filter(!is.na(dmg_map_b123_0to1)) %>%
  filter(!is.na(ratio_dmg_coder1))

pcor(raid_merged_s2)$estimate




## ----------------------------------------------------------------------
##
## Figure I1
##
## ----------------------------------------------------------------------

## ----------------------------------------------------------------------
## Analysis for Ordinary household population
## ----------------------------------------------------------------------

## Load data set
df = data_dir	%>%
  file.path("keishicho-tokeisho.xlsx")	%>%
  readxl::read_xlsx(sheet="1940-1944")

##
## Male population
##

## Calculate correlations between 1940 & 1944
cor(df$futsupopm1940,df$futsupopm1944)

## Creating scatter plots
ggplot(df, aes(x=futsupopm1940, y=futsupopm1944)) + 
  geom_smooth(method=lm, se=T, color='magenta', size=.1) +
  geom_point(alpha=0.33, color='skyblue3', stroke=0, size=3) +
  ylim(-5000, 120000) +
  xlim(0, 125000) +
  labs(x="Population as of December 1940",
       y="Population as of December 1944") + 
  theme(legend.title=element_blank(),
        legend.key=element_rect(fill='NA')) +
  annotate("text", x = 90000, y = 2500,
           label = expression("N=80, " ~ rho ~ " = 0.95"))

ggsave(file.path(figure_dir, "a_male_ordinaryHH.pdf"), 
       width = 4, 
       height = 4)

##
## Female population
##

## Calculate correlations between 1940 & 1944
cor(df$futsupopf1940,df$futsupopf1944)

## Creating scatter plots
ggplot(df, aes(x=futsupopf1940, y=futsupopf1944)) + 
  geom_smooth(method=lm, se=T, color='magenta', size=.1) +
  geom_point(alpha=0.33, color='skyblue3', stroke=0, size=3) +
  ylim(-5000, 120000) +
  xlim(0, 125000) +
  labs(x="Population as of December 1940",
       y="Population as of December 1944") + 
  theme(legend.title=element_blank(),
        legend.key=element_rect(fill='NA')) +
  annotate("text", x = 90000, y = 2500,
           label = expression("N=80, " ~ rho ~ " = 0.96"))

ggsave(file.path(figure_dir, "b_female_ordinaryHH.pdf"), 
       width = 4, 
       height = 4)


## ----------------------------------------------------------------------
## Analysis for Non-ordinary household population
## ----------------------------------------------------------------------

## calculate non-ordinary household popuation
df = df %>%
  mutate(junpopm1940 = popm1940 - futsupopm1940) %>%
  mutate(junpopf1940 = popf1940 - futsupopf1940) %>%
  mutate(junpopm1944 = popm1944 - futsupopm1944) %>%
  mutate(junpopf1944 = popf1944 - futsupopf1944) 

##
## Male population
##

## Calculate correlations between 1940 & 1944
cor(df$junpopm1940,df$junpopm1944)

## Creating scatter plots
ggplot(df, aes(x=junpopm1940, y=junpopm1944)) + 
  geom_smooth(method=lm, se=T, color='magenta', size=.1) +
  geom_point(alpha=0.33, color='skyblue3', stroke=0, size=3) +
  # ylim(0, 250000) +
  # xlim(0, 250000) +
  labs(x="Population as of December 1940",
       y="Population as of December 1944") + 
  theme(legend.title=element_blank(),
        legend.key=element_rect(fill='NA')) +
  annotate("text", x = 9000, y = 600,
           label = expression("N=80, " ~ rho ~ " = 0.87")) +
  ## adding label for outliers: Kamata
  geom_label(
    label="Kamata", 
    x=10000,
    y=20000,
    label.padding = unit(0.3, "lines"), # Rectangle size around label
    label.size = 0.35,
    color = "black",
    fill="#FFFFFF",
    size = 3
  ) 

ggsave(file.path(figure_dir, "c_male_nonordinaryHH.pdf"), 
       width = 4, 
       height = 4)

##
## Female population
##

## Calculate correlations between 1940 & 1944
cor(df$junpopf1940,df$junpopf1944)

## Creating scatter plots
ggplot(df, aes(x=junpopf1940, y=junpopf1944)) + 
  geom_smooth(method=lm, se=T, color='magenta', size=.1) +
  geom_point(alpha=0.33, color='skyblue3', stroke=0, size=3) +
  # ylim(0, 250000) +
  # xlim(0, 250000) +
  labs(x="Population as of December 1940",
       y="Population as of December 1944") + 
  theme(legend.title=element_blank(),
        legend.key=element_rect(fill='NA')) +
  annotate("text", x = 2750, y = -300,
           label = expression("N=80, " ~ rho ~ " = 0.65")) +
  ## adding label for outliers: Kamata
  geom_label(
    label="Kamata", 
    x=2680,
    y=7800,
    label.padding = unit(0.3, "lines"), # Rectangle size around label
    label.size = 0.35,
    color = "black",
    fill="#FFFFFF",
    size = 3
  ) +
  ## adding label for outliers: Kameido
  geom_label(
    label="Kameido", 
    x=2950,
    y=400,
    label.padding = unit(0.3, "lines"), # Rectangle size around label
    label.size = 0.35,
    color = "black",
    fill="#FFFFFF",
    size = 3
  ) 

ggsave(file.path(figure_dir, "d_female_nonordinaryHH.pdf"), 
       width = 4, 
       height = 4)



## ----------------------------------------------------------------------
##
## Table J1
## Result VII: Placebo regressions (Sport)
##
## ----------------------------------------------------------------------

## ----------------------------------------------------------------------
## Load data
## ----------------------------------------------------------------------
tokyo_census_tbl = data_dir	%>%	file.path("census_data4.csv")	%>%
  read_csv(col_types = cols())	%>%
  mutate(postwar_district23 = cityName ) %>%
  mutate(key_code = as.character(key_code)) %>%
  mutate(zipcode = str_replace(zipcode, "-", "")) %>%
  filter(year == 2015) 


## --- All years
tokyo_ninka_tbl = data_dir	%>%	file.path("test_ninka_2.dta")	%>%	read_dta()	%>%
  rename(
    dummy_ninka = d_ninka,
    ln_ninka_days = lnninka_days
  )
## --- Since 1993
tokyo_ninka_tbl_1993 = data_dir	%>%	file.path("test_ninka_sinceFY1993_2.dta")	%>%	read_dta()	%>%
  rename(
    dummy_ninka = d_ninka,
    ln_ninka_days = lnninka_days
  )
## --- Sports
# sports_tbl = root_dir	%>%	file.path("@@SUBMISSION/JOP_RR1/supporting_materials/R2_Q5a/sports_club_list.csv")	%>%
sports_tbl = data_dir	%>%	file.path("sports_club_list.csv")	%>%
  read_csv(col_types = cols())	%>%
  rename(
    club_name = 1, cityName = ku_name,
    chochoazaName = address_jp)	%>%
  mutate(
    chochoazaName = if_else(
      str_detect(chochoazaName, pattern = "\\d"),
      true = str_c(chochoazaName, "丁目"),
      false = chochoazaName),
    chochoazaName = stringi::stri_trans_general(chochoazaName, id = "Halfwidth-Fullwidth"),
    SportClubDummy = 1,
    zipcode = str_remove_all(zipcode, pattern = "-")
  ) 




## ----------------------------------------------------------------------
## Link function
## ----------------------------------------------------------------------
binom_link = binomial(link = logit)
count_link = quasipoisson
## ----------------------------------------------------------------------
## Outcome and treatment variables
## ----------------------------------------------------------------------
depvar_vec_combined = c("ninka_days", "ln_ninka_days", "dummy_ninka", "n_ninka")
placebo_depvar_vec = c("SportClubDummy", "LocalSportClubDummy")
treatment_vec = c("ratio_damage", "ln_damage_ratio", "binary_damage_ratio")
## ----------------------------------------------------------------------
## Covariates
## ----------------------------------------------------------------------
## District FE
fe_term = "DistrictFE"
## Geographical features
spatial_covariates = c(
  ## Residential ratio and geographical area
  "ratio_residential", "poly_area_ln", "n_neighbor",
  ## Prewar population density
  "PopDensity_1939_km2_ln",
  ## Terrain variables
  "mean_elevation_ln", "mean_slope_ln", "river_distance_ln",
  ## Railway and train stations
  "railway_length_ln", "n_stations", "SpLag_railway_length_ln", "SpLag_n_stations"
)
## ----------------------------------------------------------------------
## Location variables
distance_termz_base = c("palace_dist_ln", "minTargetDistance_ln")
polynomial_degree = 5
## --- Polynomials and splines (single dimension)
distance_term_specification = list(
  ## --- Polynomials
  str_c("poly(", distance_termz_base, ", degree = ", polynomial_degree, ", simple = TRUE, raw = TRUE)"),
  ## --- Splines
  str_c("s(", distance_termz_base, ")"),
  ## --- none (comparison purposes)
  "",
  ## --- 3rd order polynomials
  str_c("poly(", distance_termz_base, ", degree = 3, simple = TRUE, raw = TRUE)")
)
## --- Lon-lat (two-dimensional) term
spatial_term_specification = c(
  ## --- Polynomials
  str_c("poly(z_lon, z_lat, degree = ",  polynomial_degree, ", simple = TRUE, raw = TRUE)"),
  ## --- Splines
  "te(longitude, latitude)",
  ## --- none (comparison purposes)
  "",
  ## --- 3rd order polynomials
  "poly(z_lon, z_lat, degree = 3, simple = TRUE, raw = TRUE)"
)


## ----------------------------------------------------------------------
## Combine the post-1993 records and full records
## ----------------------------------------------------------------------
depvar_1993_tbl = tokyo_ninka_tbl_1993	%>%
  select(
    key_code, chochoazaName, all_of(depvar_vec_combined)
  )	%>%
  rename_at(vars(ninka_days:n_ninka), ~ str_c(., "_1993"))
## ----------------------------------------------------------------------
tmp_combined_tbl = tokyo_census_tbl	%>%
  left_join(
    tokyo_ninka_tbl	%>%
      select(key_code, chochoazaName, all_of(depvar_vec_combined))
  )	%>%
  left_join(depvar_1993_tbl, by = c("key_code", "chochoazaName"))	%>%
  left_join(sports_tbl, by = c("cityName", "chochoazaName", "zipcode"))	%>%
  rownames_to_column(var = "row_count")	%>%
  mutate(
    DistrictFE = as.factor(prewar_district),
    SportClubDummy = if_else(is.na(SportClubDummy), true = 0, false = SportClubDummy)
  )	%>%
  select(row_count, everything())

## ----------------------------------------------------------------------
## Spatially-lagged variables
## ----------------------------------------------------------------------
full_poly = data_dir	%>%	file.path("RaidShp_May2020b.rds")	%>%
  read_rds()	%>%	select(key_code, ratio_damage)	%>%
  left_join(tmp_combined_tbl	%>%	select(key_code, SportClubDummy))	%>%
  mutate(
    SportClubDummy = if_else(is.na(SportClubDummy), true = 0, false = SportClubDummy)
  )
## Use different SWM
full_contig_nb = poly2nb(full_poly, queen = TRUE)
full_contig_listw = nb2listw(full_contig_nb, style = "W", zero.policy = TRUE)	## 1 neighborhood without neighbors
## Lagged damage SWM
full_contig_nb_damage = poly2nb(full_poly	%>%	drop_na(ratio_damage), queen = TRUE)
full_contig_listw_damage = nb2listw(full_contig_nb_damage, style = "W", zero.policy = TRUE)	## 1 neighborhood without neighbors
## ----------------------------------------------------------------------
## Lagged damage
lagged_damage_tbl = full_poly	%>%	drop_na(ratio_damage)	%>%
  mutate(
    LaggedDamage = lag.listw(full_contig_listw_damage, var = full_poly	%>%	drop_na(ratio_damage)	%>%	pull(ratio_damage),
                             zero.policy = TRUE)
  )	%>%
  as_tibble()	%>%
  select(key_code, LaggedDamage)
## Construct lagged outcome
lagged_variables_tbl = full_poly	%>%
  mutate(
    LocalSportClubDummy = lag.listw(full_contig_listw, var = full_poly$SportClubDummy, zero.policy = TRUE, NAOK = TRUE),
    LocalSportClubDummy = if_else(SportClubDummy + LocalSportClubDummy > 0, true = 1, false = 0)
  )	%>%
  as_tibble()	%>%
  select(key_code, SportClubDummy, LocalSportClubDummy)	%>%
  left_join(lagged_damage_tbl, by = "key_code")
lagged_variables_tbl	%>%	summary()


## ----------------------------------------------------------------------
## Combine and save
## ----------------------------------------------------------------------
combined_tbl = tmp_combined_tbl	%>%	left_join(lagged_variables_tbl)	%>%
  mutate(
    zipcode = str_c(str_sub(zipcode, start = 1, end = 3), "-", str_sub(zipcode, start = 4, end = 7)),
    zipcode = if_else(zipcode == "962-0838", true = "173-0027", false = zipcode)
  )	%>%
  select(
    row_id, row_count:pop, zipcode, -year,
    all_of(depvar_vec_combined), contains("_1993"),
    all_of(placebo_depvar_vec), all_of(treatment_vec), LaggedDamage,
    all_of(fe_term), all_of(spatial_covariates), all_of(distance_termz_base),
    z_lon, z_lat, longitude, latitude,
    palace_dist)	%>%
  drop_na(ratio_damage)
  

## ----------------------------------------------------------------------
## Baseline formula objects
## ----------------------------------------------------------------------
mdl_lst = list(
  ## Polynomial: felm()-compatible specification
  polynomial_base_felm = chr2fml_felm(
    "outcome",
    idv_list = list(
      treatment_vec[1],
      spatial_covariates,
      distance_term_specification[[1]],
      spatial_term_specification[1]
    ),
    fe_list = fe_term,
    se_cluster = "row_count"	## Robust SE
    # se_cluster = fe_term	## Cluster SE
  ),
  ## Spline: gam()-compatible specification
  gam_base = chr2fml(
    "outcome",
    idv_list = list(
      treatment_vec[1],
      spatial_covariates, fe_term,
      distance_term_specification[[2]],
      spatial_term_specification[2]
    )
  )
)
## ----------------------------------------------------------------------


## ----------------------------------------------------------------------
## Result list object
## ----------------------------------------------------------------------
rslt_lst = list(
  ## ----------------------------------------------------------------------
  ## 1a. DV = Sport, LPM
  Sport_LPM = mdl_lst[[1]]	%>%
    update(
      str_c(placebo_depvar_vec[1], " ~ .")	%>%	as.formula()	## formula obj to feed felm()
    )	%>%
    felm(data = combined_tbl, keepCX = TRUE, psdef = FALSE),
  ## ----------------------------------------------------------------------
  ## 1b: DV = Sport, probit/logit link
  Sport_GAM = mdl_lst[[2]]	%>%
    update(str_c(placebo_depvar_vec[1], " ~ ."))	%>%
    gam(data = combined_tbl, family = binom_link),
  ## ----------------------------------------------------------------------
  ## 2a: DV = Sport (own or neighbors), LPM
  LocalSport_LPM = mdl_lst[[1]]	%>%
    update(
      str_c(placebo_depvar_vec[2], " ~ .")	%>%	as.formula()	## formula obj to feed felm()
    )	%>%
    felm(data = combined_tbl, keepCX = TRUE, psdef = FALSE),
  ## ----------------------------------------------------------------------
  ## 2b: DV = Sport (own or neighbors), probit/logit link
  LocalSport_GAM = mdl_lst[[2]]	%>%
    update(str_c(placebo_depvar_vec[2], " ~ ."))	%>%
    gam(data = combined_tbl, family = binom_link)
  ## ----------------------------------------------------------------------
)
## ----------------------------------------------------------------------
## Print regression estimates
## ----------------------------------------------------------------------
stargazer(
  rslt_lst,
  type = "text",
  align = TRUE,
  keep = c(treatment_vec)
)
## ----------------------------------------------------------------------


## EOF

